******************************************************************
* This file prepares the main analysis data for the
* Bartik IV analysis, "hierarchy_IV_data.dta"
******************************************************************
* Combine Bartik shocks, firm characteristics, and define
* other relevant firm-level trade outcomes (exports, offshoring).
* Then define main sample for regression analysis
******************************************************************
* Work from the project data directory and load the trade data.
cd "${data_dir}"
use "tradedata.dta", clear

* Product codes at 2-, 4- and 6-character aggregation, taken from the leading
* characters of the product string vare. These are used to check whether a firm
* imports goods from the same industry it exports (offshoring).
foreach digits in 2 4 6 {
    generate hs`digits' = substr(vare, 1, `digits')
}

* Mark one row per year-firm-flow-HS6 cell, so that summing the marker later
* counts distinct HS6 products.
bysort aar cvrnr ie hs6: generate evalhs6 = (_n == 1)

* For each aggregation level, store one row per year-firm-product among the
* import rows (ie == "1"): the list of products a firm imports in a year.
foreach digits in 4 6 {
    preserve
    keep if ie == "1"
    bysort aar cvrnr hs`digits': keep if _n == 1
    sort aar cvrnr hs`digits'
    save firms_import_list_hs`digits'.dta, replace
    restore
}

* Match every trade row against the import lists. A row with ie == "2" whose
* product code also appears in the firm's import list for that year is flagged:
* offshore1 uses the HS4 list, offshore2 the HS6 list.
local flagno = 0
foreach digits in 4 6 {
    local ++flagno
    merge n:1 aar cvrnr hs`digits' using firms_import_list_hs`digits'.dta
    generate offshore`flagno' = (_merge == 3 & ie == "2")
    drop _merge
}
set more off

* Firm-year indicators: 1 if any row of the firm in that year was flagged.
bysort aar cvrnr: egen offshoring_hs4 = max(offshore1)
bysort aar cvrnr: egen offshoring_hs6 = max(offshore2)

* CPI deflator, expressed relative to the year-2000 index value (5253).
* Index values are listed in year order, starting with 1991 and ending with 2010.
gen CPI = 0
local cpi_levels 4353 4445 4500 4590 4686 4785 4890 4980 5104 5253 ///
                 5377 5507 5622 5687 5790 5900 6001 6205 6287 6432
local year = 1991
foreach level of local cpi_levels {
    * aar is a string at this point, hence the quoted year
    replace CPI = `level'/5253 if aar == "`year'"
    local ++year
}

* Express trade values in year-2000 prices. Rows whose year is not in the list
* above keep CPI = 0, so the division yields a missing value for them.
replace vrd = vrd / CPI

* Aggregate to one row per firm-year and trade flow: total deflated trade value
* and the number of distinct HS6 products (sum of the evalhs6 markers).
local firmyear aar cvrnr offshoring_hs4 offshoring_hs6
collapse (sum) trade=vrd NHS6=evalhs6, by(`firmyear' ie)

* One row per firm-year; the flow code ie becomes the variable suffix
* (trade1 / NHS61 for ie == "1", trade2 / NHS62 for ie == "2").
reshape wide trade NHS6, i(`firmyear') j(ie) string

* Log trade values; missing where the firm has no (or zero) trade in that flow.
generate logimports = ln(trade1)
generate logexports = ln(trade2)

* Attach the Bartik instruments by firm-year.
merge 1:1 aar cvrnr using "bartik_instruments.dta"

* Match outcome by year. Interpretation given by the original author:
*   _merge == 2: the firm interrupted its trading activities in some years
*   _merge == 1: the firm's first year of trade activity, for which no
*                instruments are computed
tabulate aar _merge

* Keep the match outcome of the instrument merge under the name found,
* and free _merge for the next merge.
generate found = _merge
drop _merge

* Attach firm characteristics by firm-year.
sort aar cvrnr
merge 1:1 aar cvrnr using "firm_characteristics.dta"

tabulate aar _merge
drop _merge

* Further preparation: a time index and indicator sets.
* aar is made a string first (it is needed as a string for encode), and is
* converted to a numeric variable at the end of this step.
tostring aar, replace
* t: integer-coded year with the year strings as value labels
encode aar, generate(t)
* one indicator variable per category of each classification and per year
tabulate sector009, generate(sectordummy)
tabulate sector027, generate(subsectordummy)
tabulate aar, generate(yeardummy)
destring aar, replace

* Main Sample
* exporter: firm-year has a non-missing log export value
gen exporter = logexports != .
gen logsales = log(sales)
* sample: firm-years with the instruments, the number of layers and sales all observed
gen sample = I_WID != . & I_transport_f2 != . & layers != . & logsales != .

* Declare the panel structure BEFORE any time-series operator is used.
* The definition of sample4 below uses L.correctlayers, which requires the
* panel and time variables to be set; aar is numeric at this point.
xtset firmcode aar

* Define consistent samples for inequality analysis

* in levels: all level inequality measures observed, positive hours (totalh),
* and a correct layer structure in the current year
gen sample3 = sample == 1 & totalh != 0 & totalh != . ///
& log2inequality1 != . & log2inequality4 != . ///
& res2inequality1_model4 != . & res2inequality4_model4 != . ///
& wagegap3_topbottom_log != . & wagegap3_topbottom_res2 != . & correctlayers == 1
 
* in changes: all differenced inequality measures observed, and a correct
* layer structure in both the current and the previous year
gen sample4 = sample == 1 & dlog2inequality1 != . & dlog2inequality2 != . ///
& dlog2inequality3 != . & dlog2inequality4 != . & dlog2inequality5 != . ///
& dres2inequality1_model4 != . & dres2inequality4_model4 != . ///
& dgap3_log != . & dgap3_res2 != . & correctlayers == 1 & L.correctlayers == 1 

* Define core sample 
* nextsample4: the value of sample4 in the SAME firm's observation for the
* following year. Running this within cvrnr (sorted by aar) guarantees that
* _n+1 never refers to a different firm; it is missing when the firm has no
* observation in the following year.
bysort cvrnr (aar): gen nextsample4 = sample4[_n+1] if aar == aar[_n+1]-1
tab aar sample3 if (sample4==1 | nextsample4 == 1)
* samplecore3: level observations that belong to a change observation, either
* as the current year or as the base year of next year's change
gen samplecore3 = sample3 == 1 & (sample4 == 1 | nextsample4 == 1)

gen sample0 = samplecore3
* samplecore4: observations usable both in levels and in changes
gen samplecore4 = sample3 == 1 & sample4==1

* Re-declare the panel after the re-sort above, so the saved file carries the
* same panel setting as before.
xtset firmcode aar

save hierarchy_IV_data, replace

* List of firms that appear in the main sample at least once:
* one row per cvrnr, with sample equal to 1.
keep if sample == 1
keep cvrnr sample
bysort cvrnr: keep if _n == 1
save firmsample, replace


